//计算向量的内积程序
 
#include<stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>

#define IMIN(a,b)    ((a)<(b)?a:b)
//计算0,1,...N-1的平方和
#define sum_squares(x)   (x*(x+1)*(2*x+1)/6)
 
//N为输入的向量的规模
const int N = 33*1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = IMIN(32, (N + threadsPerBlock - 1) / threadsPerBlock);
 
__global__ void dot(float *a,
                    float *b,
                    float *c)
{
    //每一个块上都有cache变量的拷贝，相互之间不影响
    __shared__ float cache[threadsPerBlock];
    //tid为线程的偏移量
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    float temp = 0;

    while(tid<N) {
        temp += a[tid]*b[tid];
        //增加的下标量为进程总数
        tid += blockDim.x*gridDim.x;
    }

    cache[cacheIndex]=temp;
    //同步化当前块上的线程
    __syncthreads();
 
    int i = blockDim.x/2;
    //在块内计算部分和
    while(i!=0) {
        if(cacheIndex<i) {
            cache[cacheIndex]+=cache[cacheIndex+i];
        }
        __syncthreads();
        i /= 2;
    }
 
    if(cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
 
 
int main(void)
{
 
    float *a,*b,c,*partial_c;
 
    float *dev_a,*dev_b,*dev_partial_c;
 
    a=(float*)malloc(N*sizeof(float));
    b=(float*)malloc(N*sizeof(float));
    partial_c=(float*)malloc(blocksPerGrid*sizeof(float));
 
    cudaMalloc((void**)&dev_a, N*sizeof(float));
    cudaMalloc((void**)&dev_b, N*sizeof(float));
    cudaMalloc((void**)&dev_partial_c, blocksPerGrid*sizeof(float));
 
    for(int i=0;i<N;i++) {
 
        a[i]=i;
        b[i]=i*2;
    }
 
    cudaMemcpy(dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice);
 
    dot<<<blocksPerGrid,threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
 
    cudaMemcpy(partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost);
 
    c=0;
    for(int i=0; i<blocksPerGrid; i++) {
        c+=partial_c[i];
    }

    //测试向量内积的正确性
    printf("Does GPU value %.6g = %.6g?\n",c,
        2*sum_squares((float)(N-1)));
 
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_partial_c);
 
    free(a);
    free(b);
    free(partial_c);

    //测试输出
    int j;
    scanf("%d",&j);
}
